{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Query database\n",
    "\n",
    "Notebook for running Python sqlite3 commands for querying database.\n",
    "\n",
    "Specific code blocks run separately to produce statistics.\n",
    "\n",
    "Not intended to be run all in one session, search or scroll to find targeted SQL commands."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sqlite3\n",
    "import re\n",
    "import json\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "db_path = os.path.expanduser(\"~/data/db/arxiv_db_images.sqlite3\")\n",
    "# db_path = os.path.expanduser(\"~/data/db/arxiv_db_images_600k.sqlite3\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Here we import the sqlite3 database and create a cursor\n",
    "\n",
    "db = sqlite3.connect(db_path)\n",
    "c = db.cursor()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Testing\n",
    "\n",
    "Get the pragma table info for each table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "c.execute('PRAGMA TABLE_INFO({})'.format(\"metadata\"))\n",
    "info = c.fetchall()\n",
    "\n",
    "print(\"\\nColumn Info:\\nID, Name, Type, NotNull, DefaultVal, PrimaryKey\")\n",
    "for col in info:\n",
    "    print(col)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c.execute('PRAGMA TABLE_INFO({})'.format(\"images\"))\n",
    "info = c.fetchall()\n",
    "\n",
    "print(\"\\nColumn Info:\\nID, Name, Type, NotNull, DefaultVal, PrimaryKey\")\n",
    "for col in info:\n",
    "    print(col)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Quick test to retrive one row"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c.execute('''\n",
    "    SELECT * \n",
    "    FROM images \n",
    "    ORDER BY ROWID ASC \n",
    "    LIMIT 1\n",
    "''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c.execute('''\n",
    "    SELECT * \n",
    "    FROM metadata \n",
    "    ORDER BY ROWID ASC \n",
    "    LIMIT 1\n",
    "''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c.execute('''\n",
    "    SELECT * \n",
    "    FROM captions \n",
    "    ORDER BY ROWID ASC \n",
    "    LIMIT 2\n",
    "''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Writing to org mode\n",
    "\n",
    "This function takes a list of tuples from an SQL query and writes this data into an org table with pipes and dashes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def write_to_org(data, _write_file):\n",
    "    with open(_write_file, \"w+\") as write_file:\n",
    "        for row in data:\n",
    "            print(\"|\", file=write_file, end = '')\n",
    "            for item in row:\n",
    "                print(str(item).replace(\"\\n\", \" \"), file=write_file, end = '')\n",
    "                print(\"|\", file=write_file, end = '')\n",
    "            print(\"\\n\", file=write_file, end = '')\n",
    "        write_file.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get samples from each table\n",
    "\n",
    "Find a few random entries from both metadata and images, write to org mode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cursor = c.execute('''\n",
    "    SELECT * \n",
    "    FROM metadata \n",
    "    ORDER BY RANDOM() \n",
    "    LIMIT 3\n",
    "''')\n",
    "rows = c.fetchall()\n",
    "headings = [description[0] for description in cursor.description]\n",
    "rows.insert(0, headings)\n",
    "\n",
    "write_to_org(rows, \"metadata_samples.org\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cursor = c.execute('''\n",
    "    SELECT * \n",
    "    FROM images \n",
    "    ORDER BY RANDOM() \n",
    "    LIMIT 3\n",
    "''')\n",
    "rows = c.fetchall()\n",
    "headings = [description[0] for description in cursor.description]\n",
    "rows.insert(0, headings)\n",
    "\n",
    "write_to_org(rows, \"images_samples.org\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cursor = c.execute('''\n",
    "    SELECT * \n",
    "    FROM captions \n",
    "    ORDER BY RANDOM() \n",
    "    LIMIT 3\n",
    "''')\n",
    "rows = c.fetchall()\n",
    "headings = [description[0] for description in cursor.description]\n",
    "rows.insert(0, headings)\n",
    "\n",
    "write_to_org(rows, \"captions_samples.org\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Find duplicate entries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# look for duplicate rows in the metadata table\n",
    "c.execute('''\n",
    "    SELECT identifier, COUNT(identifier)\n",
    "    FROM metadata\n",
    "    GROUP BY identifier\n",
    "    HAVING COUNT(identifier) > 1\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)\n",
    "    \n",
    "duplicates = rows\n",
    "\n",
    "sql = ('''\n",
    "    SELECT id, identifier, created, cat, authors, title\n",
    "    FROM metadata\n",
    "    WHERE identifier IS ?\n",
    "''')\n",
    "\n",
    "for d in duplicates[:]:\n",
    "    c.execute(sql, (d[0], ))\n",
    "    rows = c.fetchall()\n",
    "    \n",
    "    same = True\n",
    "    r1 = rows[0][1:]\n",
    "    for row in rows[1:]:\n",
    "        if r1 != row[1:]:\n",
    "            same = False\n",
    "    if same is False:\n",
    "        print(\"-----\",d[0])\n",
    "        for row in rows:\n",
    "            print(row)\n",
    "        print(\"----- !!! mismatch !!! -----\")\n",
    "#     else:\n",
    "#         print(\"*** entry identitical ***\")\n",
    "\n",
    "print(f'total number of duplicate entries: {len(rows)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# look for duplicate rows in the images table\n",
    "# this brings up totals for images by article ID\n",
    "c.execute('''\n",
    "    SELECT identifier, COUNT(identifier)\n",
    "    FROM images\n",
    "    GROUP BY identifier\n",
    "    HAVING COUNT(identifier) > 0\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# look for duplicate images (search by filename, filesize, and identifier)\n",
    "# there is quite a lot of these, but that is how the source data was uploaded\n",
    "c.execute('''\n",
    "    SELECT identifier, filename, filesize, path, COUNT(*) c\n",
    "    FROM images\n",
    "    GROUP BY identifier, filename, filesize\n",
    "    HAVING c > 1\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)\n",
    "print(f'total number of duplicate images: {len(rows)}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Database queries for statistics"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Earliest dates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Find the earliest date of an article\n",
    "\n",
    "c.execute('''\n",
    "    SELECT created, identifier\n",
    "    FROM metadata\n",
    "    WHERE created IS NOT NULL\n",
    "    ORDER BY created ASC\n",
    "    LIMIT 20\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Find the earliest dated articles and get the associated images\n",
    "\n",
    "c.execute('''\n",
    "    SELECT images.id, metadata.created, metadata.identifier \n",
    "    FROM images\n",
    "    LEFT JOIN metadata ON images.identifier = metadata.identifier\n",
    "    WHERE metadata.created IS NOT NULL\n",
    "    ORDER BY created ASC\n",
    "    LIMIT 40\n",
    "    ''')\n",
    "    \n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Total numbers of articles and images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find total number of rows in metadata\n",
    "\n",
    "c.execute('''\n",
    "    SELECT count(*)\n",
    "    FROM metadata\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find total number of rows in images\n",
    "\n",
    "c.execute('''\n",
    "    SELECT count(*)\n",
    "    FROM images\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find total number of rows in images \n",
    "# (filtering out images without size or imageformat)\n",
    "\n",
    "c.execute('''\n",
    "    SELECT count(*)\n",
    "    FROM images\n",
    "    WHERE x IS NOT null AND x != ''\n",
    "    AND y IS NOT null AND y != ''\n",
    "    AND imageformat is not null AND imageformat != ''\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Average image sizes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get average of the x size column\n",
    "\n",
    "c.execute('''\n",
    "    SELECT avg(x)\n",
    "    FROM images\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get average of the y size column\n",
    "\n",
    "c.execute('''\n",
    "    SELECT avg(y)\n",
    "    FROM images\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get median of the x size column\n",
    "\n",
    "c.execute('''\n",
    "    SELECT x\n",
    "    FROM images\n",
    "    ORDER by x\n",
    "    LIMIT 1\n",
    "    OFFSET (SELECT COUNT(*) FROM images) / 2\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get median of the y size column\n",
    "\n",
    "c.execute('''\n",
    "    SELECT y\n",
    "    FROM images\n",
    "    ORDER by y\n",
    "    LIMIT 1\n",
    "    OFFSET (SELECT COUNT(*) FROM images) / 2\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Missing fields"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find where there are NULL values for x\n",
    "\n",
    "c.execute('''\n",
    "    SELECT id, identifier, path, filename, filesize, x, y, imageformat\n",
    "    FROM images\n",
    "    WHERE x is null or x = ''\n",
    "    OR y is null or y = ''\n",
    "    OR imageformat is null or imageformat = ''\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(len(rows))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# generate a text file listing paths of files that couldn't be identified\n",
    "with open(\"identify_errors_db.txt\", \"a+\") as f:\n",
    "    for row in rows[:]:\n",
    "        filepath = row[2][1:] + \"/\" + row[3]\n",
    "        print(str(row[0]) + \",\" + filepath)\n",
    "        f.write(str(row[0]) + \",\" + filepath + \"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find where there are NULL values for y\n",
    "\n",
    "c.execute('''\n",
    "    SELECT id, identifier, path, filename, filesize, x, y, imageformat\n",
    "    FROM images\n",
    "    WHERE y is null or y = ''\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(len(rows))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find where there are NULL values for identifier\n",
    "\n",
    "c.execute('''\n",
    "    SELECT id, identifier, path, filesize, x, y, imageformat\n",
    "    FROM images\n",
    "    WHERE identifier is null or identifier = ''\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)\n",
    "print(f'total number of entries found: {len(rows)}')\n",
    "# no result is a good result!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find where there are NULL values for filesize\n",
    "\n",
    "c.execute('''\n",
    "    SELECT id, identifier, path, filesize, x, y, imageformat\n",
    "    FROM images\n",
    "    WHERE filesize is null or filesize = ''\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)\n",
    "print(f'total number of entries found: {len(rows)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get number of articles with no category\n",
    "# this seems to give no results\n",
    "\n",
    "c.execute('''\n",
    "    SELECT identifier, cat\n",
    "    FROM metadata\n",
    "    WHERE cat IS NULL OR cat = '' OR cat = 'None'\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find where there is no matching metadata for a given image\n",
    "\n",
    "c.execute('''\n",
    "    SELECT count(images.identifier), images.identifier\n",
    "    FROM images\n",
    "    LEFT JOIN metadata ON metadata.identifier = images.identifier\n",
    "    WHERE metadata.identifier IS NULL\n",
    "    GROUP BY images.identifier\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Averages of images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get total number of images per article\n",
    "\n",
    "c.execute('''\n",
    "    SELECT images.identifier, metadata.cat, count(images.identifier)    \n",
    "    FROM images \n",
    "    LEFT JOIN metadata ON images.identifier = metadata.identifier \n",
    "    GROUP BY images.identifier\n",
    "    ORDER BY count(images.identifier)\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Get total number of images per category\n",
    "\n",
    "c.execute('''\n",
    "    SELECT metadata.cat, count(images.identifier)\n",
    "    FROM images\n",
    "    LEFT JOIN metadata ON images.identifier = metadata.identifier \n",
    "    GROUP BY metadata.cat\n",
    "    ORDER BY count(images.identifier) DESC    \n",
    "    LIMIT 200\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Get total number of images per primary category only\n",
    "\n",
    "c.execute('''\n",
    "    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1), count(images.identifier)\n",
    "    FROM images\n",
    "    LEFT JOIN metadata ON images.identifier = metadata.identifier \n",
    "    GROUP BY substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)\n",
    "    ORDER BY count(images.identifier) DESC    \n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)\n",
    "write_to_org(rows, \"stats_images_by_category.org\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Get total number of images per primary category only - with a specific filter\n",
    "\n",
    "c.execute('''\n",
    "    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) c, count(images.identifier)\n",
    "    FROM images\n",
    "    LEFT JOIN metadata ON images.identifier = metadata.identifier\n",
    "    WHERE c = \"cond-mat.mes-hall\"\n",
    "    GROUP BY c\n",
    "    ORDER BY count(images.identifier) DESC    \n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find total number of images in a given category and year\n",
    "\n",
    "c.execute('''\n",
    "    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) c, \n",
    "            count(images.identifier), \n",
    "            strftime(\"%Y\", metadata.created) as 'Y'\n",
    "    FROM images\n",
    "    LEFT JOIN metadata ON images.identifier = metadata.identifier\n",
    "    WHERE c = \"cs.CV\"\n",
    "    AND Y = '2019'\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get total number of images per primary category with multiple filters\n",
    "\n",
    "c.execute('''\n",
    "    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) c, count(images.identifier)\n",
    "    FROM images\n",
    "    LEFT JOIN metadata ON images.identifier = metadata.identifier\n",
    "    WHERE c = \"cs.LG\"\n",
    "    OR c = \"cs.CV\"\n",
    "    OR c = \"stat.ML\"\n",
    "    OR c = \"cs.AI\"\n",
    "    GROUP BY c\n",
    "    ORDER BY count(images.identifier) DESC    \n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get total number of images per primary category\n",
    "\n",
    "c.execute('''\n",
    "    SELECT substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1), count(images.identifier)\n",
    "    FROM images\n",
    "    LEFT JOIN metadata ON images.identifier = metadata.identifier\n",
    "    GROUP BY substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1)\n",
    "    ORDER BY count(images.identifier) DESC    \n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for row in rows:\n",
    "    print(str(row[1]) + \" \\t \" + str(row[0]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Totals by month/year"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find number of articles by year\n",
    "\n",
    "c.execute('''\n",
    "    SELECT count(metadata.identifier), strftime(\"%Y\", metadata.created) as 'Y'\n",
    "    FROM metadata\n",
    "    GROUP BY strftime(\"%Y\", metadata.created)\n",
    "    ORDER BY strftime(\"%Y\", metadata.created) ASC\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find number of articles in each month/year\n",
    "\n",
    "c.execute('''\n",
    "    SELECT count(metadata.identifier), strftime(\"%m-%Y\", metadata.created) as 'mY'\n",
    "    FROM metadata\n",
    "    GROUP BY strftime(\"%m-%Y\", metadata.created)\n",
    "    ORDER BY strftime(\"%Y\", metadata.created) ASC\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find number of images by year\n",
    "\n",
    "c.execute('''\n",
    "    SELECT count(images.identifier), strftime(\"%Y\", metadata.created) as 'Y'\n",
    "    FROM images\n",
    "    LEFT JOIN metadata ON images.identifier = metadata.identifier \n",
    "    GROUP BY strftime(\"%Y\", metadata.created)\n",
    "    ORDER BY strftime(\"%Y\", metadata.created) ASC\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find number of images in each month/year\n",
    "\n",
    "c.execute('''\n",
    "    SELECT count(images.identifier), strftime(\"%m-%Y\", metadata.created) as 'mY'\n",
    "    FROM images\n",
    "    LEFT JOIN metadata ON images.identifier = metadata.identifier \n",
    "    GROUP BY strftime(\"%m-%Y\", metadata.created)\n",
    "    ORDER BY strftime(\"%Y\", metadata.created) ASC\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "total = 0\n",
    "for row in rows:\n",
    "    total += row[0]\n",
    "print(total)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### ...and by category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find number of articles by year by category - cs.CV\n",
    "\n",
    "c.execute('''\n",
    "    SELECT count(metadata.identifier), strftime(\"%Y\", metadata.created) as 'Y'\n",
    "    FROM metadata\n",
    "    WHERE substr(trim(cat),1,instr(trim(cat)||' ',' ')-1) = 'cs.CV'\n",
    "    GROUP BY strftime(\"%Y\", metadata.created)\n",
    "    ORDER BY strftime(\"%Y\", metadata.created) ASC\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find number of articles by year by category, for a specific query\n",
    "\n",
    "c.execute('''\n",
    "    SELECT count(metadata.identifier), strftime(\"%Y\", metadata.created) as 'Y'\n",
    "    FROM metadata\n",
    "    WHERE substr(trim(cat),1,instr(trim(cat)||' ',' ')-1) = 'cs.CV'\n",
    "    AND strftime(\"%Y\", metadata.created) = '2018'\n",
    "    GROUP BY strftime(\"%Y\", metadata.created)\n",
    "    ORDER BY strftime(\"%Y\", metadata.created) ASC\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find number of articles by year by category, for a specific query\n",
    "\n",
    "c.execute('''\n",
    "    SELECT count(metadata.identifier), strftime(\"%Y\", metadata.created) as 'Y'\n",
    "    FROM metadata\n",
    "    WHERE substr(trim(cat),1,instr(trim(cat)||' ',' ')-1) = 'cs.CV'\n",
    "    AND strftime(\"%Y\", metadata.created) = '2012'\n",
    "    GROUP BY strftime(\"%Y\", metadata.created)\n",
    "    ORDER BY strftime(\"%Y\", metadata.created) ASC\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Other metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find number of images per each different imageformat\n",
    "\n",
    "c.execute('''\n",
    "    SELECT imageformat, count(imageformat)\n",
    "    FROM images\n",
    "    GROUP BY imageformat\n",
    "    ORDER BY count(imageformat) DESC\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get only the first listed category\n",
    "\n",
    "c.execute('''\n",
    "    SELECT identifier, cat, substr(trim(cat),1,instr(trim(cat)||' ',' ')-1)\n",
    "    FROM metadata\n",
    "    LIMIT 20\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the licence information for number of articles\n",
    "\n",
    "c.execute('''\n",
    "    SELECT licence, COUNT(licence)\n",
    "    FROM metadata\n",
    "    GROUP BY licence\n",
    "    HAVING COUNT(identifier) > 0\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)\n",
    "write_to_org(rows, \"licences.org\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the number of images per licence\n",
    "\n",
    "c.execute('''\n",
    "    SELECT metadata.licence, COUNT(metadata.licence)\n",
    "    FROM images\n",
    "    LEFT JOIN metadata ON images.identifier = metadata.identifier\n",
    "    GROUP BY metadata.licence\n",
    "    HAVING COUNT(images.identifier) > 0\n",
    "    ORDER BY COUNT(metadata.licence)\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)\n",
    "write_to_org(rows, \"image_licences.org\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get number of articles for each primary category\n",
    "\n",
    "c.execute('''\n",
    "    SELECT count(substr(trim(cat),1,instr(trim(cat)||' ',' ')-1)) AS cnt, substr(trim(cat),1,instr(trim(cat)||' ',' ')-1)\n",
    "    FROM metadata\n",
    "    GROUP BY substr(trim(cat),1,instr(trim(cat)||' ',' ')-1)\n",
    "    ORDER BY cnt DESC\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(str(row[0]) + \" \\t\" + row[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Querying by image extension"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define reverse function\n",
    "db.create_function(\"reverse\", 1, lambda s: s[::-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the number of images per extension\n",
    "# DON'T filter based on X or Y dimension\n",
    "\n",
    "c.execute('''\n",
    "    SELECT COUNT(reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1))), reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)) AS extension\n",
    "    FROM images\n",
    "    GROUP BY extension \n",
    "    ''')\n",
    "\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the number of images per extension\n",
    "# filter based on X or Y dimension\n",
    "\n",
    "c.execute('''\n",
    "    SELECT COUNT(reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1))), reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)) AS extension\n",
    "    FROM images\n",
    "    WHERE x is not null and x != ''\n",
    "    AND y is not null and y != ''\n",
    "    AND imageformat is not null and imageformat != ''\n",
    "    GROUP BY extension\n",
    "    ''')\n",
    "\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the number of images per extension by year\n",
    "# filter based on X or Y dimension\n",
    "# 2018 only\n",
    "\n",
    "c.execute('''\n",
    "    SELECT reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)) AS extension, COUNT(reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)))\n",
    "    FROM images\n",
    "    LEFT JOIN metadata ON images.identifier = metadata.identifier\n",
    "    WHERE x is not null and x != ''\n",
    "    AND y is not null and y != ''\n",
    "    AND imageformat is not null and imageformat != ''\n",
    "    AND strftime(\"%Y\", metadata.created) = '2018'\n",
    "    GROUP BY extension\n",
    "    ''')\n",
    "\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the number of images per extension with percentages\n",
    "# filter based on X or Y dimension\n",
    "\n",
    "c.execute('''\n",
    "    SELECT reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)) AS extension, COUNT(reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)))\n",
    "    FROM images\n",
    "    LEFT JOIN metadata ON images.identifier = metadata.identifier\n",
    "    WHERE x is not null and x != ''\n",
    "    AND y is not null and y != ''\n",
    "    AND imageformat is not null and imageformat != ''\n",
    "    GROUP BY extension\n",
    "    ''')\n",
    "\n",
    "rows = c.fetchall()\n",
    "    \n",
    "total = 0\n",
    "for row in rows:\n",
    "    total += row[1]\n",
    "print(\"total:\", total)\n",
    "\n",
    "print(\"*\" * 20)\n",
    "\n",
    "alldata = []\n",
    "for i, row in enumerate(rows):\n",
    "    alldata.append([row[0], row[1], row[1]/total])\n",
    "for d in alldata:\n",
    "    print(\"{} | {} | {:2.2%}\".format(d[0], d[1], d[2]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Get extensions by years and get statistics for changing percentages\n",
    "\n",
    "This section is quite a convoluted way of finding the number of images by extension for each year and then calculating the change over time for these. Requires running a number of iterative searches."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get list of years\n",
    "years = []\n",
    "\n",
    "c.execute('''\n",
    "    SELECT strftime(\"%Y\", metadata.created), COUNT(strftime(\"%Y\", metadata.created))\n",
    "    FROM metadata\n",
    "    GROUP BY strftime(\"%Y\", metadata.created)\n",
    "    ''')\n",
    "\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)\n",
    "    years.append(row[0])\n",
    "print(years)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Get the number of images per extension by year\n",
    "# filter based on X or Y dimension\n",
    "\n",
    "data = []\n",
    "\n",
    "# manually selecting years\n",
    "# years = [\"2000\", \"2005\", \"2010\", \"2015\"]\n",
    "\n",
    "db.create_function(\"reverse\", 1, lambda s: s[::-1])\n",
    "sql = ('''\n",
    "    SELECT reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)) AS extension, COUNT(reverse(substr(reverse(filename),1,instr(reverse(filename),'.')-1)))\n",
    "    FROM images\n",
    "    LEFT JOIN metadata ON images.identifier = metadata.identifier\n",
    "    WHERE x is not null and x != ''\n",
    "    AND y is not null and y != ''\n",
    "    AND imageformat is not null and imageformat != ''\n",
    "    AND strftime(\"%Y\", metadata.created) = ?\n",
    "    GROUP BY extension\n",
    "    ''')\n",
    "\n",
    "# data = []\n",
    "\n",
    "for y, year in enumerate(years):\n",
    "    print(\"*\" * 20)\n",
    "    print(year)\n",
    "    print(\"*\" * 20)\n",
    "    c.execute(sql, (year, ))\n",
    "\n",
    "    rows = c.fetchall()\n",
    "#     for row in rows:\n",
    "#         print(row)\n",
    "    total = 0\n",
    "    for row in rows:\n",
    "        total += row[1]\n",
    "    print(\"total:\", total)\n",
    "\n",
    "    data = []\n",
    "    for i, row in enumerate(rows):\n",
    "        data.append([row[0], row[1], row[1]/total])\n",
    "    for d in data:\n",
    "        print(\"{} | {} | {:2.2%}\".format(d[0], d[1], d[2]))\n",
    "#     print(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for d in data:\n",
    "    print(d)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get proportion as percentage\n",
    "\n",
    "total = 0\n",
    "for row in rows:\n",
    "    total += row[1]\n",
    "print(total)\n",
    "\n",
    "data = []\n",
    "for row in rows:\n",
    "    data.append([row[0], row[1], row[1]/total])\n",
    "print(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# take second element for sort\n",
    "def takeSecond(elem):\n",
    "    return elem[1]\n",
    "data.sort(key=takeSecond)\n",
    "\n",
    "for d in data:\n",
    "    print(\"{} | {} | {:2.2%}\".format(d[0], d[1], d[2]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "values = []\n",
    "names = []\n",
    "for row in rows:\n",
    "    values.append(row[0])\n",
    "    names.append(row[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "values_save = values\n",
    "names_save = names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i, name in enumerate(names.copy()):\n",
    "    names[i] = name.lower()\n",
    "print(names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(values)\n",
    "print(names)\n",
    "print(len(values))\n",
    "print(len(names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "total = 0\n",
    "for row in rows:\n",
    "    total += row[0]\n",
    "print(total)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "write_to_org(rows, \"stats_articles_by_cat.org\")\n",
    "# print(\"* number of articles by category\", file=write_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Image metadata\n",
    "\n",
    "Here we pull the 'creator' field from the table, with some cleaning using regex. There are two versions, one without the year, and one with years. The version without years creates an ordered and summed list of all image creator fields. The version with years pulls all the creator data and then builds an ordered table for creator fields by year.\n",
    "\n",
    "Note: code is a little sketchy, use with discretion."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get_year = False\n",
    "get_year = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the number of images per creator\n",
    "\n",
    "if get_year is False:\n",
    "    c.execute('''\n",
    "        SELECT images.creator, COUNT(images.creator)\n",
    "        FROM images\n",
    "        GROUP BY images.creator\n",
    "        ORDER BY COUNT(images.creator) DESC\n",
    "        ''')\n",
    "else:\n",
    "    c.execute('''\n",
    "        SELECT images.creator, COUNT(images.creator), strftime(\"%Y\", metadata.created)\n",
    "        FROM images\n",
    "        LEFT JOIN metadata ON images.identifier = metadata.identifier \n",
    "        GROUP BY images.creator, strftime(\"%Y\", metadata.created)\n",
    "        ORDER BY images.creator ASC\n",
    "        ''')\n",
    "rows = c.fetchall()\n",
    "print(len(rows))\n",
    "print(rows[:20])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# remove the ® symbol\n",
    "creators = []\n",
    "\n",
    "if get_year is False:\n",
    "    for i, (creator, count) in enumerate(rows):\n",
    "        if \"®\" in creator:\n",
    "            print(\"found symbol\")\n",
    "            print(creator)\n",
    "            new_creator = creator.replace(\"®\", \"\")\n",
    "            print(new_creator)\n",
    "            creators.append([new_creator, count])\n",
    "        else:\n",
    "            creators.append([creator, count])\n",
    "else:\n",
    "    for i, (creator, count, year) in enumerate(rows):\n",
    "        if \"®\" in creator:\n",
    "            print(\"found symbol\")\n",
    "            print(creator)\n",
    "            new_creator = creator.replace(\"®\", \"\")\n",
    "            print(new_creator)\n",
    "            creators.append([new_creator, count, year])\n",
    "        else:\n",
    "            creators.append([creator, count, year])\n",
    "print(creators[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cleaned = []\n",
    "\n",
    "regex = r'\\n?\\s?(?:http://|www.|edited with|created with the |esp|afpl|windows|adobe|gpl\\s|apple|wolfram|gnu\\s|microsoft office |microsoft|the\\s|version|v\\.|v\\s)?\\s?((?:[\\w\\/]+(?:[\\-]?(?!\\d)))+)(?:\\sversion|v\\.|v\\ |\\,|\\n|\\s|\\-|\\.|\\(|\\))?\\s?'\n",
    "\n",
    "if get_year is False:\n",
    "    for creator, total in creators[:]:\n",
    "        print(creator + \" | \" + str(total))\n",
    "        match = re.search(regex, creator, re.IGNORECASE)\n",
    "        if match:\n",
    "            cleaned.append([match.group(1), total])\n",
    "            print(\">>>\",match.group(1))\n",
    "        else:\n",
    "            cleaned.append([creator, total])\n",
    "else:\n",
    "    for creator, total, year in creators[:]:\n",
    "        print(f'{creator} | {total} | {year}')\n",
    "        match = re.search(regex, creator, re.IGNORECASE)\n",
    "        if match:\n",
    "            cleaned.append([match.group(1), total, year])\n",
    "            print(\">>>\",match.group(1))\n",
    "        else:\n",
    "            cleaned.append([creator, total, year])\n",
    "            \n",
    "for row in cleaned[50:100]:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if get_year is False:\n",
    "    filename = \"creator_cleaned.json\"\n",
    "else:\n",
    "    filename = \"creator_cleaned_years.json\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(filename, 'w') as outfile:\n",
    "    json.dump(cleaned, outfile)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sum_duplicates(a, ind):\n",
    "    length = len(a)\n",
    "    print(\"length:\",length)\n",
    "    \n",
    "    if ind == length:\n",
    "        print(\"ind == length\")\n",
    "        print(\"ending!\")\n",
    "        return 1    \n",
    "    else:\n",
    "        target = a[ind][0]\n",
    "        print(\"target:\",target)\n",
    "\n",
    "        for i, (creator, total) in enumerate(a):\n",
    "            removables = []\n",
    "            if i != ind:\n",
    "#                 print(\"i != ind\")\n",
    "#                 print(\"i:\",i,\"ind:\",ind)\n",
    "                if creator == target:\n",
    "                    print(\"creator == target\")\n",
    "                    print(\"creator\",creator,\"target\",target)\n",
    "                    removables.append(i)\n",
    "#             else:\n",
    "                # if i and ind are the same, don't compare\n",
    "#                 print(\"i == ind\")\n",
    "#                 print(\"i:\",i,\"ind:\",ind)\n",
    "        print(\"removables:\",removables)\n",
    "        for rem in removables:\n",
    "            a[ind][1] += a[rem][1]\n",
    "            del a[rem]\n",
    "        return 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sum up totals\n",
    "# create new array with each entry\n",
    "\n",
    "with open(filename) as infile:\n",
    "    full_list = json.load(infile)\n",
    "    \n",
    "summed = cleaned[:]\n",
    "\n",
    "ind = 0\n",
    "running = True\n",
    "while running:\n",
    "    length = len(summed)\n",
    "    print(\"length:\",length)\n",
    "    \n",
    "    if ind == length:\n",
    "        print(\"ind == length\")\n",
    "        print(\"--- ending! ---\")\n",
    "        running = False # stop if at the end of the list   \n",
    "    else:\n",
    "        target = summed[ind][0]\n",
    "        print(\"target:\",target)\n",
    "        removables = []\n",
    "        if get_year is False:\n",
    "            for i, (creator, total) in enumerate(summed):\n",
    "                if i != ind:\n",
    "                    if creator.lower() == target.lower():\n",
    "                        print(\"i:\", i, \"creator\",creator,\"target\",target)\n",
    "                        removables.append(i)\n",
    "        else:\n",
    "            for i, (creator, total, year) in enumerate(summed):\n",
    "                if i != ind:\n",
    "                    if creator.lower() == target.lower() and year == summed[ind][2]:\n",
    "                        print(\"i:\", i, \"creator:\",creator,\"target:\",target,\"year:\",year)\n",
    "                        removables.append(i)     \n",
    "\n",
    "        print(\"removables:\",removables)\n",
    "\n",
    "        for rem in removables[::-1]:\n",
    "            print(\"removing item\", rem)\n",
    "            summed[ind][1] += summed[rem][1]\n",
    "            del summed[rem]\n",
    "        ind += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# print results\n",
    "print(len(summed))\n",
    "for row in summed:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if get_year is False:\n",
    "    summed_filename = \"creator_summed.json\"\n",
    "else:\n",
    "    summed_filename = \"creator_summed_years.json\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(summed_filename, 'w') as outfile:\n",
    "    json.dump(summed, outfile)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(summed_filename, 'r') as infile:\n",
    "    summed = json.load(infile)\n",
    "    \n",
    "sorted_list = sorted(summed, key=lambda x: x[1], reverse=True)\n",
    "print(sorted_list[:20])\n",
    "\n",
    "list_of_creators = []\n",
    "for row in sorted_list:\n",
    "    list_of_creators.append(row[0])\n",
    "    \n",
    "print(list_of_creators)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "write_to_org(sorted_list, \"creator_totals_summed_sorted.org\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "if get_year is True:\n",
    "    ordered_creator_list = []\n",
    "\n",
    "    # take the top 100 creators from the sorted list\n",
    "    for c, n in sorted_list[:100]:\n",
    "        creator_sublist = []\n",
    "        # work by year\n",
    "        for yy in reversed(range(1990, 2019)):\n",
    "    #         print(yy, c)\n",
    "        #     print(c, n)\n",
    "            added = False\n",
    "            for row in summed:\n",
    "                if c.lower() == row[0].lower() and str(yy) == row[2]:\n",
    "                    creator_sublist.append(row)\n",
    "                    added = True\n",
    "            if added == False:\n",
    "                creator_sublist.append([c, 0, yy])\n",
    "    #     print(creator_sublist)\n",
    "        ordered_creator_list.append(creator_sublist)\n",
    "    \n",
    "    for i in range(len(ordered_creator_list)):\n",
    "        ordered_creator_list[i][2] = str(ordered_creator_list[i][2])\n",
    "    \n",
    "    for r in ordered_creator_list[:3]:\n",
    "        print(r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "if get_year is True:\n",
    "    with open(\"creator_totals_all_years_test.org\", \"w+\") as write_file:\n",
    "        # first row\n",
    "        print(\"| creator |\", file=write_file, end = '')\n",
    "        for yy in reversed(range(1990, 2019)):\n",
    "            print(str(yy), file=write_file, end = '')\n",
    "            print(\"|\", file=write_file, end = '')\n",
    "        print(\"\\n\", file=write_file, end = '')\n",
    "\n",
    "        for row in ordered_creator_list:\n",
    "            print(\"row:\", row)\n",
    "            print(\"| \", file=write_file, end = '')\n",
    "            print(str(row[0][0]).replace(\"\\n\", \" \"), file=write_file, end = '')\n",
    "            print(\"|\", file=write_file, end = '')\n",
    "        #     print(row)\n",
    "            for yy in reversed(range(1990, 2019)):\n",
    "                for item in row:\n",
    "    #                 print(\"item:\", item)\n",
    "    #                 print(\"yy:\", yy)\n",
    "                    if item[2] == str(yy) or item[2] == yy or str(item[2]) == str(yy):\n",
    "                        print(\">>> match:\", item)\n",
    "                        print(item[1], file=write_file, end = '')\n",
    "                        print(\"|\", file=write_file, end = '')\n",
    "            print(\"\\n\", file=write_file, end = '')\n",
    "    write_file.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Number of images per licence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the number of images per creator -- use a search term for creator\n",
    "\n",
    "c.execute('''\n",
    "    SELECT images.creator, COUNT(images.creator)\n",
    "    FROM images\n",
    "    WHERE images.creator LIKE 'matlab%' COLLATE NOCASE\n",
    "    GROUP BY images.creator\n",
    "    ORDER BY COUNT(images.creator) DESC\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "total = 0\n",
    "for name, num in rows:\n",
    "    total += num\n",
    "print(total)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"|-|-|-|\")\n",
    "for row in rows[1:16]:\n",
    "    per = str(round((row[1]/10053059*100), 2))\n",
    "    print(\"|\" + per + \"|\" + str(row[1]) + \"|\" + row[0] + '|')\n",
    "print(\"|-|-|-|\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Get image paths for 2019/2020\n",
    "\n",
    "SQLite query joining tables and finding images of that year, formatted for other scripts in text file that has `filepath,id`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "c.execute('''\n",
    "    SELECT images.id, images.path, images.filename, strftime(\"%Y\", metadata.created) as \"Y\"\n",
    "    FROM images\n",
    "    LEFT JOIN metadata ON images.identifier = metadata.identifier \n",
    "    WHERE strftime(\"%Y\", metadata.created) = '2019'\n",
    "    OR strftime(\"%Y\", metadata.created) = '2020'\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "print(len(rows))\n",
    "for row in rows:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# write to file\n",
    "\n",
    "writepath = os.path.expanduser(\"~/data/paths/paths_src_update_sqlite.txt\")\n",
    "    \n",
    "with open(writepath, \"w+\") as write_file:\n",
    "    for id_num, path, filename, year in rows[:]:\n",
    "        line = \"{}/{},{}\".format(path, filename, id_num)\n",
    "    #     print(line)\n",
    "        print(line, file=write_file, end = '\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Get numbers of images for given years and categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find number of images by year by category, for a specific query (cs.LG)\n",
    "\n",
    "c.execute('''\n",
    "    SELECT count(images.identifier), strftime(\"%Y\", metadata.created) as 'Y'\n",
    "    FROM images\n",
    "    LEFT JOIN metadata ON images.identifier = metadata.identifier\n",
    "    WHERE substr(trim(cat),1,instr(trim(cat)||' ',' ')-1) = 'cs.LG'\n",
    "    GROUP BY strftime(\"%Y\", metadata.created)\n",
    "    ORDER BY strftime(\"%Y\", metadata.created) ASC\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "total = 0\n",
    "for row in rows:\n",
    "    print(row)\n",
    "    total += row[0]\n",
    "print(f'total number of images: {total}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find number of images by year by category, for a specific query\n",
    "\n",
    "c.execute('''\n",
    "    SELECT count(images.identifier), strftime(\"%Y\", metadata.created) as 'Y'\n",
    "    FROM images\n",
    "    LEFT JOIN metadata ON images.identifier = metadata.identifier\n",
    "    GROUP BY strftime(\"%Y\", metadata.created)\n",
    "    ORDER BY strftime(\"%Y\", metadata.created) ASC\n",
    "    ''')\n",
    "rows = c.fetchall()\n",
    "total = 0\n",
    "for row in rows:\n",
    "    print(row)\n",
    "    total += row[0]\n",
    "print(f'total number of images: {total}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### sanity check for number of papers in a given category and timeframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sql = (\"SELECT metadata.cat, metadata.id \"\n",
    "    \"FROM metadata \"\n",
    "    \"WHERE metadata.created BETWEEN date('2012-01-01') \"\n",
    "    \"AND date('2012-12-31') \"\n",
    "    \"AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ?\")\n",
    "\n",
    "c.execute(sql, (\"cs.AI\", ))\n",
    "rows = c.fetchall()\n",
    "\n",
    "print(len(rows))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sql = '''SELECT licence, count(licence)\n",
    "    FROM metadata\n",
    "    GROUP BY licence'''\n",
    "\n",
    "c.execute(sql, )\n",
    "rows = c.fetchall()\n",
    "\n",
    "print(len(rows))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Grab predictions\n",
    "\n",
    "Format for JavaScript"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sql = '''SELECT substr(trim(vggpred),1,instr(trim(vggpred)||' ',' ')-1) as p, \n",
    "                count(substr(trim(vggpred),1,instr(trim(vggpred)||' ',' ')-1))\n",
    "        FROM images\n",
    "        GROUP BY p\n",
    "        ORDER BY count(p) DESC'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sql = '''SELECT substr(trim(authors),1,instr(trim(authors)||' ',',')-1) as p, \n",
    "                count(substr(trim(authors),1,instr(trim(authors)||' ',',')-1))\n",
    "        FROM metadata\n",
    "        GROUP BY p\n",
    "        ORDER BY count(p) DESC'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sql = '''SELECT substr(trim(creator),1,instr(trim(creator)||' ',',')-1) as p, \n",
    "                count(substr(trim(creator),1,instr(trim(creator)||' ',',')-1))\n",
    "        FROM images\n",
    "        GROUP BY p\n",
    "        ORDER BY count(p) DESC'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c.execute(sql, )\n",
    "rows = c.fetchall()\n",
    "\n",
    "print(len(rows))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "for i, row in enumerate(rows[1:539]):\n",
    "    print(i, row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = {}\n",
    "data['options'] = []\n",
    "for row in rows[:250]:\n",
    "    data['options'].append({\n",
    "        'cat': str(row[0]),\n",
    "        'label': str(row[1])\n",
    "    })\n",
    "print(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(summed)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To get the formatted creator field, run the queries from above including cleaning and summing, then use the summed list to print JSON"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
